# summary table
library(readr)
library(dplyr)
library(kableExtra)
library(tidyr)
library(writexl)

# Read in the data.
# Ideally this would have been a single dataset from the beginning, but the
# three datasets have different scope conditions, missing values, etc., so
# I started off with three datasets. And now: path dependency.

# h1 file: keep the income-stream columns and the join keys, then add one
# 1/0 indicator per category of none_one_or_both_nats_agri_char
# (a missing category stays NA in all three indicators).
h1 <- read_csv("data/epro_data_h1.csv") %>%
  select(
    resource_and_agriculture,
    none_one_or_both_nats_agri_char,
    gwgroupid,
    year
  ) %>%
  mutate(
    no_streams = as.numeric(none_one_or_both_nats_agri_char == "None"),
    single_income_stream = as.numeric(none_one_or_both_nats_agri_char == "Single"),
    both_income_streams = as.numeric(none_one_or_both_nats_agri_char == "Both")
  )

# h2 file: religion and disagreement columns plus the join keys.
h2 <- read_csv("data/epro_data_h2.csv") %>%
  select(
    n_ed_religions,
    hhi_rel,
    religious_segments_bin,
    gwgroupid,
    year,
    disagreement,
    share_disagreement
  )

# h3 file: settlement-polygon columns, group/country names and the join keys.
h3 <- read_csv("data/epro_geo_data_h3.csv") %>%
  select(
    n_non_intersection_group_polygons,
    multiple_polygons_bin,
    spatial_hhi,
    gwgroupid,
    year,
    groupname,
    countryname
  )

# h2 has all observations, so it is the left-hand table of both joins;
# h1 and h3 columns are NA for group-years they do not cover.
epro_cluster_data <- h2 %>%
  left_join(h1, by = c("gwgroupid", "year")) %>%
  left_join(h3, by = c("gwgroupid", "year"))


# Summary statistics table: mean, min, max, SD and number of non-missing
# observations for each variable, written to disk as a LaTeX booktabs table.
# Approach adapted from:
# https://stackoverflow.com/questions/72319408/dplyr-production-of-a-summary-descriptive-statistics-table-standard-error-and
epro_cluster_data %>%
  # Rename to the display labels used in the table; labels become row names.
  select(
    "Disagreement" = disagreement,
    "Disagreement Share" = share_disagreement,
    "Natural resources and agriculture" = resource_and_agriculture,
    "No income streams" = no_streams,
    "Single income stream" = single_income_stream,
    "Both income streams" = both_income_streams,
    "Number of religious segments" = n_ed_religions,
    "Religious fractionalization (HHI)" = hhi_rel,
    "Several religious segments (0/1)" = religious_segments_bin,
    "N settlement areas" = n_non_intersection_group_polygons,
    "Several settlement areas (0/1)" = multiple_polygons_bin,
    "Geographic fractionalization (HHI)" = spatial_hhi
  ) %>%
  # One wide row with columns named "<label>_<statistic>".
  summarize(
    across(
      everything(),
      list(
        Mean = ~ mean(.x, na.rm = TRUE),
        Min = ~ min(.x, na.rm = TRUE),
        Max = ~ max(.x, na.rm = TRUE),
        SD = ~ sd(.x, na.rm = TRUE),
        Observations = ~ sum(!is.na(.x))
      )
    )
  ) %>%
  mutate(across(everything(), ~ round(.x, digits = 2))) %>%
  # Reshape to one row per variable and one column per statistic. The name is
  # split on the LAST underscore, so a display label that itself contains "_"
  # still parses correctly (statistic names never contain one).
  pivot_longer(
    everything(),
    names_pattern = "^(.*)_([^_]+)$",
    names_to = c("variable", ".value")
  ) %>%
  # Blank header for the variable-name column in the printed table.
  rename(" " = "variable") %>%
  kable("latex", booktabs = TRUE, linesep = "") %>%
  write_lines("tables/summary_table.tex")



# Table of positive cases: group-years with disagreement == 1 and at least one
# of the three indicators (resource_and_agriculture, religious_segments_bin,
# multiple_polygons_bin) equal to 1, reduced to one row per
# (gwgroupid, groupname, countryname) combination and written to xlsx.
# filter() drops rows for which the overall condition evaluates to NA.
positive_cases <- epro_cluster_data %>%
  filter(
    disagreement == 1 &
      (resource_and_agriculture == 1 |
         religious_segments_bin == 1 |
         multiple_polygons_bin == 1)
  ) %>%
  # distinct() keeps the FIRST matching row in data order (not a random year).
  # Deduplicating on the three identifiers directly, instead of grouping by
  # them first, yields the same rows but an ungrouped tibble.
  # NOTE(review): a gwgroupid can still appear more than once if its
  # groupname/countryname differ (or are NA) across years - confirm intended.
  distinct(gwgroupid, groupname, countryname, .keep_all = TRUE) %>%
  # gwgroupid is listed explicitly so the id column is kept on purpose rather
  # than being re-added implicitly as a leftover grouping variable.
  select(
    gwgroupid,
    "Disagreement" = disagreement,
    "Group" = groupname,
    "Country" = countryname,
    "Natural resources and agriculture" = resource_and_agriculture,
    "Several religious segments (0/1)" = religious_segments_bin,
    "Several settlement areas (0/1)" = multiple_polygons_bin
  )

write_xlsx(positive_cases, "data/positive_cases_table.xlsx")






